# =============================================================================
# =============================================================================
# Code to download ACS data at the tract level
#       This data is used to identify potential demographic trends in neighborhoods where a plasma center opens
# =============================================================================
# =============================================================================


# =============================================================================
# Mark the counties / MSAs with the largest changes in plasmapheresis centers. 
# =============================================================================

api_key='YOUR_API_KEY' #Create your API key here: https://api.census.gov/data/key_signup.html  see also https://www.census.gov/data/developers/guidance/api-user-guide.html
os.chdir(cdd['p_c']+r'/Census')
import CensusCustom as censusd2
from census import Census
c=Census(api_key)

def downsize(df,types):
    for var in list(set(list(df)) & set(list(types))):
#        print(var)
        if not isnan2(types[var]):
            if types[var]=='float':
                if not np.issubdtype(df[var].dtype, np.number):
                    df[var]=df[var].astype(str).replace(regex=True,to_replace=r'-',value=r'').replace('.','').replace('nan','').replace('&','').replace(' ','').replace('None','')
                df[var]=pd.to_numeric(df[var])
            elif types[var]=='integer':
                if not np.issubdtype(df[var].dtype, np.number):
                    df[var]=df[var].astype(str).replace(regex=True,to_replace=r'-',value=r'').replace('.','').replace('nan','').replace('&','').replace(' ','').replace('None','')
                df[var]=pd.to_numeric(df[var])
            elif types[var]=='cat':
                df[var]=df[var].astype('category')
            elif types[var]=='date':
                df[var]=pd.to_datetime(df[var], format='%Y-%m-%d')
    return df




# =============================================================================
# ACS __ (5/year files available for 2010-> at the tract level but only 2013-> at the cbg level)
# =============================================================================

#Import the list of variables I want to use (their types and naming)
acs_vars = pd.read_excel(cdd['p_c'] + r'\2_American Community Survey\ACS_Variables.xlsx',sheet_name='ACS5Y')
# acs_vars = acs_vars[(acs_vars.dne==0)&(acs_vars.Available.isin(['cbg','tract']))]
acs_vars = acs_vars[(acs_vars.dne==0)]
acs_rndict = dict(zip(acs_vars['Variable'].to_list(),acs_vars['Rename'].to_list()))
acs_ddict =  dict(zip(acs_vars['Rename'].to_list(),acs_vars['Type'].to_list()))


def DistributedCensusTract(state,acs_fields,as_of):
    error=True
    for i in range(0,10):
        try:
            c=Census(api_key)
            temp = pd.DataFrame(c.acs5.state_county_tract(fields=tuple(['NAME']+acs_fields),state_fips=state,county_fips='*', tract='*', year=as_of))
            temp['year'] = as_of
            temp.to_parquet(cdd['p_d_acs'] + r'/tract/raw/acs_'+str(as_of)+'_'+state+'.parquet')
            error=False
        except:
            #wait for a second after a failed attempt
            time.sleep(1)
            pass
        if not error:          #If we successfully retreived the data then break the loop.
            break


sl = list(us.states.mapping('fips', 'abbr').keys())
for y in range(2010,2022):
    print(y)
    acs_fields=acs_vars[acs_vars.depth<=y]['Variable'].tolist()
    sls = ldiff(sl,[f.split('_')[2].split('.')[0] for f in os.listdir(cdd['p_d_acs'] + r'//tract') if f.split('_')[1]==str(y)])
    Parallel(n_jobs=10,prefer='processes')(delayed(DistributedCensusTract)(state=s,acs_fields=acs_fields,as_of=y) for s in tqdm(sls))



# =============================================================================
# Adjust variables to get them in useful forms
# =============================================================================

#Import and merge all of the annual ACS files
acs=pd.concat([pd.read_parquet(cdd['p_d_acs'] + r'//tract/raw/'+f) for f in  os.listdir(cdd['p_d_acs'] + r'//tract/raw')],ignore_index=True,sort=False)
acs['tract']=pd.to_numeric(acs['tract'])
acs['state'] = pd.to_numeric(acs['state'])
acs['county'] = pd.to_numeric(acs['county'])
acs = downsize(df=acs.rename(columns=acs_rndict),types=acs_ddict)


acs['male_p']=acs['male']/acs['population']
acs['white_p']=acs['white']/acs['population']
acs['educ_sc_p']=acs[['educ_sc11m','educ_sc12m','educ_sc11f','educ_sc12f','educ_assocm','educ_assocf']].sum(axis=1,min_count=1)/acs['educ_total']
acs['educ_bach_p']=acs[['educ_bachm','educ_bachf','educ_grad1m','educ_grad1f','educ_grad2m','educ_grad2f','educ_grad3m','educ_grad3f']].sum(axis=1,min_count=1)/acs['educ_total']
acs['pv_total_r'] = acs[['pv_u50','pv_50to100','pv_100to125','pv_125to150','pv_150to185','pv_185to200','pv_a200']].sum(axis=1,min_count=1)
acs['pv100_p']=acs[['pv_u50','pv_50to100']].sum(axis=1,min_count=1)/acs['pv_total_r']
acs['pv200_p']=acs[['pv_u50','pv_50to100','pv_100to125','pv_125to150','pv_150to185','pv_185to200']].sum(axis=1,min_count=1)/acs['pv_total_r']
acs['pvf_total_r'] = acs[['pvf_u50','pvf_50to75','pvf_75to100','pvf_100to125','pvf_125to150','pvf_150to175','pvf_175to185','pvf_185to200','pvf_200to300','pvf_300to400','pvf_400to500','pvf_a500']].sum(axis=1,min_count=1)
acs['pvf100_p']=acs[['pvf_u50','pvf_50to75','pvf_75to100']].sum(axis=1,min_count=1)/acs['pvf_total_r']
acs['pvf200_p']=acs[['pvf_u50','pvf_50to75','pvf_75to100','pvf_100to125','pvf_125to150','pvf_150to175','pvf_175to185','pvf_185to200']].sum(axis=1,min_count=1)/acs['pvf_total_r']
acs['snap_p']=acs['snap']/acs['hh_total']
acs['aid_p']=acs['aid']/acs['aid_total']
acs['pubai_p']=acs['pubai']/acs['pubai_total']
temp = ['hi_mu6_i','hi_m6to18_i','hi_m18to25_i','hi_m26to34_i','hi_m35to44_i','hi_m45to54_i','hi_m55to64_i','hi_m65to74_i','hi_ma75_i', 'hi_fu6_i','hi_f6to18_i','hi_f18to25_i','hi_f26to34_i','hi_f35to44_i','hi_f45to54_i','hi_f55to64_i','hi_f65to74_i','hi_fa75_i']
acs['insured_p'] = acs[temp].sum(axis=1) / acs[temp + [re.sub(r'_i',r'_ni',v) for v in temp]].sum(axis=1)
acs['comm_public_p']=acs['comm_public']/acs['comm_total']
acs['comm_car_p']=acs['comm_car']/acs['comm_total']
acs['comm_timemean']=acs['comm_timeagg']/(acs['comm_total']-acs['comm_home'])
#Calculate the mean income
acs['inc_hhmean']=acs['inc_hhagg']/acs['hh_total']
acs['ownocc_p']=acs['housing_ownocc']/acs['housing_total']
acs['tract']=pd.to_numeric(acs['tract'])
#Employment variables
acs['emp_civilemp_p'] = acs['emp_civilemp'] / acs['emp_total']
acs['emp_civilunemp_p'] = acs['emp_civilunemp'] / acs['emp_total']
acs['emp_military_p'] = acs['emp_military'] / acs['emp_total']
acs['emp_nonlaborforce_p'] = acs['emp_nonlaborforce'] / acs['emp_total']
empt = ['empt_16to19','empt_20to24','empt_25to44','empt_45to54','empt_55to64','empt_65to69','empt_gte70']
acs['empt_ft_p'] = acs[[v+'_ft' for v in empt]].sum(axis=1) / acs['empt_total']
acs['empt_nft_p'] = acs[[v+'_nft' for v in empt]].sum(axis=1) / acs['empt_total']
acs['empt_nw_p'] = acs[[v+'_nw' for v in empt]].sum(axis=1) / acs['empt_total']
acs['fips'] = acs['GEO_ID'].apply(lambda x: x.split('US')[1])

keep_vars = ['tract', 'year', 'state', 'county', 'fips', 'population', 'white_p', 'male_p',
             'educ_sc_p', 'educ_bach_p', 'pvf100_p', 'pvf200_p', 'pv100_p', 'pv200_p', 'aid_p', 'snap_p', 'pubai_p', 'insured_p',
             'ownocc_p', 'comm_car_p', 'comm_public_p', 'comm_timemean', 'gini',
             'emp_civilemp_p', 'emp_civilunemp_p', 'emp_military_p', 'emp_nonlaborforce_p', 'empt_ft_p', 'empt_nft_p', 'empt_nw_p',
             'inc_hhmean', 'mhi_quin1', 'mhi_quin2', 'mhi_quin3', 'mhi_quin4', 'mhi_quin5', 'mhi_t5p']
acs[keep_vars].to_parquet(cdd['p_d_acs'] +r'/tract/acs_2010_2021_s1.parquet')




# =============================================================================
# Concat the ACS and Census data to a single file and interpolate annually
# =============================================================================
acs=pd.read_parquet(cdd['p_d_acs'] +r'/tract/acs_2010_2021_s1.parquet')

#Get Inflation data. (Downloaded manually from FRED: https://fred.stlouisfed.org/series/CPIAUCSL)
inf = pd.read_csv(cdd['p_d_acs'] + r'/CPIAUCSL.csv').rename(columns={'DATE':'date','CPIAUCSL':'cpi'})
inf['date'] = pd.to_datetime(inf['date'])

#Merge in inflation and deflate the dollar fields.
acs['date'] = pd.to_datetime(dict(year=acs.year, month=12, day=1))
acs=pd.merge(acs,inf,how='left',on='date')
acs['inc_hhmean']=acs['inc_hhmean'] * 262.005 /acs['cpi'] #Index inflation to Dec 2020

acs.sort_values(by=['fips','year','population'],inplace=True)
acs.drop_duplicates(subset=['fips','year'],inplace=True)

acs.to_parquet(cdd['p_d_acs'] +r'/tract/acs_2010_2021_s2.parquet')


# =============================================================================
# Add in the annual population density with population weighted quantiles.
# =============================================================================
def weighted_qcut(values, weights, q):
    'Return weighted quantile cuts from a given series, values.'
    data=pd.DataFrame({'val':values,'w':weights.replace(0,np.nan)})
    data['index'] = 1
    data['index'] = data['index'].cumsum()
    #Collapse the duplicate values
    data_c = data.groupby(['val'])['w'].sum().reset_index()
    #Sort by the variable of interest
    data_c['wf'] = data_c['w'] / data_c['w'].sum()
    data_c.sort_values(by=['val'],inplace=True)
    data_c['wc'] = data_c['wf'].cumsum()
    if pd._libs.lib.is_integer(q):
        data_c['q'] = np.floor(data_c['wc']*q)
        data_c['q'] = data_c['q'].clip(0,q-1)
    else:
        data_c['q'] = data_c['wc'].apply(lambda x: min([v for v in q if v>x]))
    data = pd.merge(data,data_c[['val','q']],how='left',on='val')
    data['q'] = np.where((pd.isnull(data['val']))|(pd.isnull(data['w'])),np.nan,data['q'])
    return data.sort_values(by='index')['q'].values


tracts = pd.concat([gpd.read_file(cdd['p_d_geo_shp_tiger']+r'/TRACT_2019/' + f) for f in os.listdir(cdd['p_d_geo_shp_tiger']+r'/TRACT_2019/')],ignore_index=True,sort=False)
acs = pd.read_parquet(cdd['p_d_acs'] +r'/tract/acs_2010_2021_s2.parquet')
acs = pd.merge(acs,tracts[['GEOID','ALAND']].rename(columns={'GEOID':'fips'}),how='left',on='fips')
acs['pop2sqkm'] = (acs['population'] / acs['ALAND'].apply(lambda x: max(x,10000)))*1000000
acs.replace([np.inf, -np.inf], np.nan, inplace=True)

#For each year create a temporary file and calculate the population weighted quantile bins
pop_dens = []
for y in acs['year'].unique():
    temp=acs[acs.year==y][['fips','year','population','pop2sqkm']].copy()
    temp['pop2sqkm_wq2'] = weighted_qcut(temp['pop2sqkm'],temp['population'],2)
    temp['pop2sqkm_wq3'] = weighted_qcut(temp['pop2sqkm'],temp['population'],3)
    temp['pop2sqkm_wq10'] = weighted_qcut(temp['pop2sqkm'],temp['population'],10)
    pop_dens += [temp]
pop_dens = pd.concat(pop_dens,ignore_index=True,sort=False)

acs = pd.merge(acs,pop_dens.drop(columns=['pop2sqkm','population']),how='left',on=['fips','year'])
acs.to_parquet(cdd['p_d_acs'] +r'/tract/acs_2010_2021_s3.parquet')
acs.to_stata(cdd['p_d_acs'] +r'/tract/acs_2010_2021_s3.parquet')


